suppressPackageStartupMessages({
library(SummarizedExperiment)
library(SEtools)
library(ggplot2)
library(plgINS)
library(edgeR)
library(DT)
library(viridis)
library(RColorBrewer)
library(data.table)
library(dplyr)
library(tibble)
library(cowplot)
library(matrixStats)
library(RUVSeq)
})## Warning: replacing previous import 'GenomicRanges::shift' by 'data.table::shift'
## when loading 'plgINS'
# input file: Bartel DEA SE
input <- "data/bartel.hela.DEA.SE.rds"
#input <- "data/bartel.hek.DEA.SE.rds"
# output file: enrichMiR results
output.e <- "data/bartel.hela.e.TSperm.rds"
#output.e <- "data/bartel.hek.e.TSperm.rds"
# number of cores for high runtime functions
cores <- 10
# set number of replicates per permutation proportion
nrep <- 1# load enrichMiR package
devtools::load_all("/mnt/schratt/tgermade_test/master_19_20/enrichMiR/enrichMiR/")#' getDEA
#'
#' @param dea.df dataframe of DEA results (an SE rowData column if generated via DEA() function)
#'
#' @return FDR-filtered dataframe containing c("symbol","logFC","PValue","FDR") columns
#'
getDEA <- function(dea.df){
if(!any(colnames(dea.df) %in% "symbol")) dea.df$symbol <- rownames(dea.df)
dea <- as.data.frame(dea.df[,c("symbol","logFC", "logCPM","PValue","FDR")])
dea <- aggregate(dea[,-1], by=list(symbol=dea$symbol),FUN=mean)
rownames(dea) <- dea$symbol
dea <- dea[,-1]
lapply(dea, FUN = function(x){
if(any(is.infinite(x))){
w <- which(is.infinite(x))
x[w] <- max(abs(x[setdiff(which(dea$FDR<0.5),w)]))*sign(x[w])
}
})
return(dea)
}# generate dea object (high runtime!)
#dea.list <- lapply(dea.df.names, FUN = function(x) getDEA(rowData(se.hela)[[x]]))
dea.list <- bplapply(dea.list, getDEA,
BPPARAM = MulticoreParam(cores) )
names(dea.list) <- dea.names# we don't use miRNA expression values for the benchmark. This will make it more difficult # for the enrichMiR tests
mirexpr <- NULL#' getTS
#'
#' @param species character object. Can be "human", "mouse" or "rat"
#'
#' @return TargetScan miRNA target dataframe with family information in metadata()
#'
getTS <- function(species=c("human","mouse","rat")){
library(S4Vectors)
species <- match.arg(species)
# assign species ID
spec <- switch( species,
human = 9606,
mouse = 10090,
rat = 10116 )
# download TargetScan miRNA targeting dataset
tmp <- tempfile()
download.file(
"http://www.targetscan.org/vert_72/vert_72_data_download/Summary_Counts.all_predictions.txt.zip",
tmp)
unzip(tmp)
full <- fread("Summary_Counts.all_predictions.txt") #, sep = "\t", header = TRUE)
# limit to selected species
sub <- full[full$'Species ID' == spec,]
sub$score <- as.numeric(as.character(sub$'Cumulative weighted context++ score'))
# generate TargetScan dataframe
ts <- DataFrame(family = sub$'miRNA family',
rep.miRNA = sub$'Representative miRNA',
feature = sub$'Gene Symbol',
sites = sub$'Total num conserved sites',
score = as.numeric(as.character(sub$'Cumulative weighted context++ score'))
)
ts <- DataFrame(
aggregate(sub[,c("sites","score")], by=ts[,c("family","feature")], FUN=mean)
)
# download TargetScan miRNA families dataset
tmp <- tempfile()
download.file(
"http://www.targetscan.org/vert_72/vert_72_data_download/miR_Family_Info.txt.zip",
tmp)
unzip(tmp)
full <- fread("miR_Family_Info.txt") #, sep = "\t", header = TRUE)
# limit to selected species
sub <- full[full$'Species ID' == spec,]
fam <- sub$`Seed+m8`
names(fam) <- sub$`MiRBase ID`
# add family info to ts dataframe as attribute
metadata(ts)$families <- fam
# enrichMiR cant handle 0 values for sites feature
return(ts[ts$sites!=0,])
}# load detailed Bartel treatment info containing exact sequences
pert <- read.csv("data/bartel_treatments.txt", sep = "\t")
# we extract HeLa & HEK data (we disregard passenger strands as long as we're working with TargetScan)
colnames(pert) <- c("treatment","seq")
pert <- list(pert[1:17,], pert[37:48,])
names(pert) <- c("hela","hek")
if(grepl("hela", input)){
pert <- pert$hela
} else if(grepl("hek", input)){
pert <- pert$hek
}
# find out where the seed sequence begins
#unlist(gregexpr(pattern ="GAGGUAG",pert$seq))
#unlist(gregexpr(pattern ="GGAAUGU",pert$seq))
# extract seed sequences
pert$seed <- sapply(pert$seq, function(x) substring(x ,2, 8))
# get seed family for each treatment miRNA (true positive)
TP <- sapply(dea.names, FUN=function(x) unique(as.character(
pert$seed[grepl(paste0(x,"\\("), pert$treatment) | grepl(paste0(x,"$"), pert$treatment)]
)) )#' TSperm
#'
#' @param TS TargetScan DataFrame of miRNA tx targets
#' @param genes string vector of gene names
#' @param props vector of proportions, e.g. c(.2,.3,.4,.5)
#'
#' @return TS DataFrame with permuted miRNA targets
#'
TSperm <- function(TS, TP, genes, props){
# get TargetScan data for each treatment miRNA family
TS.part <- TS[TS$family==TP,]
lapply(props, FUN=function(p){
sn <- floor(p*nrow(TS.part))
genes <- genes[!genes %in% TS.part$feature]
i <- sample(seq_len(nrow(TS.part)), sn)
j <- sample(seq_len(length(genes)), sn)
TS.part$feature[i] <- as.character(genes[j])
TS <- rbind(TS.part, TS[TS$family!=TP,])
return(TS)
})
}# the permuatation proportions that should be computed for each DEA
names(i) <- i <- 1:nrep
props <- unlist(lapply( c("20"=0.2, "30"=0.3, "40"=0.4, "50"=0.5),
FUN=function(x) lapply(i, FUN=function(y) x)))
# do TS permutations
set.seed(1234)
TS.list <- bplapply(dea.names, FUN=function(i){
TSperm(TS, TP[i], rownames(se), props)
}, BPPARAM = MulticoreParam(cores) )
# naming
names(TS.list) <- dea.names
for(i in dea.names){
names(TS.list[[i]]) <- names(props)
}
# add original TS to permutated ones
for(i in dea.names){
TS.list[[i]][["original"]] <- TS
}
# place original at first place
for(i in dea.names){
TS.list[[i]] <- TS.list[[i]][c("original", names(props))]
}
props.all <- c(original="original", props)#tests <- c("areamir","overlap","michael","KS","KS2","MW")
tests <- NULL
#cores <- 2
# run enrichMiR on all objects of dea.list (high runtime!)
e.list <- bplapply(dea.names, FUN=function(i){
lapply(names(props.all), FUN=function(j){
enrichMiR(DEA=as.data.frame(dea.list[[i]]), TS=TS.list[[i]][[j]], miRNA.expression=mirexpr,
cleanNames=TRUE, tests=tests)
})
}, BPPARAM = MulticoreParam(cores) )
names(e.list) <- dea.names
for(i in dea.names){
names(e.list[[i]]) <- names(props.all)
}dat <- function(e, thresholds, TP){
dplyr::bind_rows(
lapply(e@res, FUN=function(x){
if("family" %in% colnames(x)) row.names(x) <- x$family
x$truth <- row.names(x) %in% TP
x$FDR[is.na(x$FDR)] <- 1
as.data.frame(
t(sapply(thresholds, FUN=function(i){
P <- sum(x$FDR<=i)
c( threshold=i,
P=P,
TP=sum(x$FDR<=i & x$truth),
FP=sum(x$FDR<=i & !x$truth),
TPR=sum(x$FDR<=i & x$truth)/sum(x$truth),
FPR=sum(x$FDR<=i & !x$truth)/sum(!x$truth),
FDR=ifelse(P>0,sum(x$FDR<=i & !x$truth)/P,0) )
}))
)
}), .id="method")
}# Benchmarking
devtools::load_all("../enrichMiR/enrichMiR/")
fams <- metadata(TS)$families
#dea <- deas[[1]]
#dea <- rowData(se.hela)$`DEA.let-7a`
#e <- enrichMiR(dea, TS)
#e <- e.list$`miR-122`
#TP <- unique(as.character(fams[grep("miR-144$|miR-144-",names(fams))]))
thresholds <- c(0,10^(-10:-3),0.005,0.01,0.025,0.05,0.075,0.1,0.15,0.2,0.25,0.3,0.5)
# get benchmark results
dat.list <- lapply(dea.names[1], FUN=function(x) lapply(names(e.list[[x]]), FUN=function(y) dat(e.list[[x]][[y]], thresholds, TP.list[[x]]) ))
ggplot(dat.list[[1]][[2]], aes(FPR, TPR, colour=method)) + geom_line() + geom_point(size=3) +
scale_x_log10()
#+ scale_colour_manual(values=cols)#' doBenchmark
#'
#' @param res enrichMiR test results (e object)
#' @param TP character vector: contains the seeds (families) for a miRNA treatment
#'
#' @return a dataframe containing scores for each enrichMiR test
#'
doBenchmark <- function(res, TP){
res <- lapply(res, FUN=function(x){
x <- x[order(x$FDR),]
if("family" %in% colnames(x)) row.names(x) <- x$family
x$truth <- row.names(x) %in% TP
x$FDR[is.na(x$FDR)] <- 1
x
})
data.frame( method=names(res),
detPPV = sapply(res, FUN=function(x) 1/which(x$truth)[1] ),
FP.atFDR05 = sapply(res, FUN=function(x) sum(!x$truth & x$FDR<0.05)),
log10QDiff = sapply(res, FUN=function(x){
tp1 <- -log10(x$FDR[which(x$truth)[1]])
fp1 <- -log10(x$FDR[which(!x$truth)[1]])
tp1-fp1
}),
log10QrelIncrease = sapply(res, FUN=function(x){
tp1 <- -log10(x$FDR[which(x$truth)[1]])
fp1 <- -log10(x$FDR[which(!x$truth)[1]])
(tp1-fp1)/(min(tp1,fp1))
}),
TP.atFDR05 = sapply(res, FUN=function(x) sum(x$truth[x$FDR<0.05]))
)
}# generate the benchmarking scores
BM.list <- lapply(dea.names, FUN=function(x){
lapply(names(e.list[[x]]), FUN=function(y){
doBenchmark(e.list[[x]][[y]]@res, TP[x])
})
})
# naming
names(BM.list) <- dea.names
for(x in dea.names){
names(BM.list[[x]]) <- names(e.list[[x]])
}
# generate a results df for plotting
BM.list2 <- lapply(BM.list, FUN=function(x) dplyr::bind_rows(x, .id = "prop.rep"))
BM.df <- dplyr::bind_rows(BM.list2, .id="treatment")
BM.df$prop <- unlist(lapply(strsplit(BM.df$prop.rep, "[.]"), FUN=function(x) x[1]))
BM.df$prop <- factor(BM.df$prop, levels=unique(BM.df$prop))
df <- BM.df# plots
for(i in c("detPPV","FP.atFDR05","log10QDiff","TP.atFDR05")){
cat("Score analysis: ", i,"\n\n")
## boxplot
print(
ggplot(df, aes(x=prop, y=df[[i]], fill=method)) + geom_boxplot() + ylab(i)
)
if(i!="TP.atFDR05"){
## density plot: score distribution per permutation
print(
ggplot(df, aes(log10(df[[i]]))) + geom_density(aes(fill=method), alpha=0.9) + facet_wrap(~prop) + xlab(i)
)
## violin plot
print(
ggplot(df, aes(x=prop, y=df[[i]], fill=method)) + geom_violin() + facet_wrap(~method) +
guides(fill=FALSE) + ylab(i) + geom_jitter(shape=16, position=position_jitter(0.2))
)
}
## violin plot median
print(
ggplot(df, aes(x=prop, y=df[[i]], fill=method)) + geom_violin(draw_quantiles = 0.5) + facet_wrap(~method) +
guides(fill=FALSE) + ylab(i)
)
}## Score analysis: detPPV
## Score analysis: FP.atFDR05
## Score analysis: log10QDiff
## Score analysis: TP.atFDR05
# mean scores over all datasets; this shows which datasets were difficult to handle for the tests
ggplot(df, aes(x=prop, y=detPPV, fill=treatment)) + geom_violin(draw_quantiles = 0.5) + facet_wrap(~treatment) +
guides(fill=FALSE)mean.datasets <- lapply( unique(df$treatment), FUN=function(x)
sapply( unique(df$prop), function(y)
round( mean(df$detPPV[df$treatment==x & df$prop==y], na.rm=TRUE), 3)
)
)
names(mean.datasets) <- dea.names
for(i in dea.names){
names(mean.datasets[[i]]) <- props.all
}
mean.datasets## $`let-7a`
## original 0.2 0.3 0.4 0.5
## 0.768 0.768 0.768 0.768 0.701
##
## $`miR-1`
## original 0.2 0.3 0.4 0.5
## 0.801 0.801 0.801 0.801 0.801
##
## $`miR-124`
## original 0.2 0.3 0.4 0.5
## 0.801 0.801 0.768 0.534 0.425
##
## $`miR-137`
## original 0.2 0.3 0.4 0.5
## 0.768 0.768 0.768 0.768 0.568
##
## $`miR-139`
## original 0.2 0.3 0.4 0.5
## 0.408 0.309 0.325 0.115 0.084
##
## $`miR-143`
## original 0.2 0.3 0.4 0.5
## 0.824 0.824 0.824 0.825 0.771
##
## $`miR-144`
## original 0.2 0.3 0.4 0.5
## 0.825 0.528 0.258 0.232 0.274
##
## $`miR-153`
## original 0.2 0.3 0.4 0.5
## 0.768 0.768 0.534 0.434 0.359
##
## $`miR-155`
## original 0.2 0.3 0.4 0.5
## 0.780 0.780 0.777 0.777 0.777
##
## $`miR-182`
## original 0.2 0.3 0.4 0.5
## 0.768 0.414 0.404 0.390 0.279
##
## $`miR-199a`
## original 0.2 0.3 0.4 0.5
## 0.824 0.785 0.785 0.780 0.780
##
## $`miR-204`
## original 0.2 0.3 0.4 0.5
## 0.795 0.780 0.746 0.775 0.743
##
## $`miR-205`
## original 0.2 0.3 0.4 0.5
## 0.824 0.732 0.824 0.720 0.593
##
## $`miR-216b`
## original 0.2 0.3 0.4 0.5
## 0.713 0.492 0.383 0.112 0.108
##
## $`miR-223`
## original 0.2 0.3 0.4 0.5
## 0.824 0.824 0.824 0.824 0.795
##
## $`miR-7`
## original 0.2 0.3 0.4 0.5
## 0.824 0.785 0.824 0.785 0.795
# mean scores over all methods
ggplot(df, aes(x=prop, y=detPPV, fill=method)) + geom_violin(draw_quantiles = 0.5) + facet_wrap(~method) +
guides(fill=FALSE)mean.methods <- lapply( unique(df$method), FUN=function(x)
sapply( unique(df$prop), function(y)
round( mean(df$detPPV[df$method==x & df$prop==y], na.rm=TRUE), 3)
)
)
names(mean.methods) <- unique(df$method)
for(i in unique(df$method)){
names(mean.methods[[i]]) <- props.all
}
mean.methods## $aREAmir
## original 0.2 0.3 0.4 0.5
## 0.969 0.849 0.774 0.730 0.620
##
## $EN.up
## original 0.2 0.3 0.4 0.5
## 0.005 0.005 0.005 0.005 0.005
##
## $EN.down
## original 0.2 0.3 0.4 0.5
## 1.000 0.881 0.806 0.754 0.706
##
## $EN.combined
## original 0.2 0.3 0.4 0.5
## 0.950 0.857 0.779 0.743 0.633
##
## $wEN.up
## original 0.2 0.3 0.4 0.5
## 0.005 0.005 0.005 0.006 0.005
##
## $wEN.down
## original 0.2 0.3 0.4 0.5
## 1.000 0.938 0.885 0.776 0.757
##
## $wEN.combined
## original 0.2 0.3 0.4 0.5
## 1.000 0.938 0.896 0.760 0.741
##
## $michael.up
## original 0.2 0.3 0.4 0.5
## 0.005 0.005 0.005 0.005 0.005
##
## $michael.down
## original 0.2 0.3 0.4 0.5
## 1.000 0.906 0.854 0.760 0.683
##
## $michael.combined
## original 0.2 0.3 0.4 0.5
## 1.000 0.938 0.865 0.806 0.766
##
## $MW
## original 0.2 0.3 0.4 0.5
## 0.939 0.834 0.782 0.681 0.624
##
## $KS
## original 0.2 0.3 0.4 0.5
## 0.847 0.743 0.784 0.663 0.588
##
## $KS2
## original 0.2 0.3 0.4 0.5
## 0.958 0.858 0.856 0.755 0.675
##
## $GSEA
## original 0.2 0.3 0.4 0.5
## 0.777 0.710 0.724 0.698 0.602
##
## $GSEAmodified
## original 0.2 0.3 0.4 0.5
## 0.730 0.587 0.524 0.478 0.476
##
## $modSites
## original 0.2 0.3 0.4 0.5
## 0.938 0.875 0.818 0.774 0.755
##
## $modScore
## original 0.2 0.3 0.4 0.5
## 0.943 0.876 0.876 0.829 0.757
# sensitivity score (ratio of TP at FDR 0.05)
sens.methods <- lapply( unique(df$method), FUN=function(x)
sapply( unique(df$prop), function(y)
round( sum(df$TP.atFDR05==1 & df$method==x & df$prop==y) / sum(df$method==x & df$prop==y), 3)
)
)
names(sens.methods) <- unique(df$method)
for(i in unique(df$method)){
names(sens.methods[[i]]) <- props.all
}
sens.methods## $aREAmir
## original 0.2 0.3 0.4 0.5
## 0.875 0.750 0.812 0.625 0.500
##
## $EN.up
## original 0.2 0.3 0.4 0.5
## 0.125 0.000 0.000 0.062 0.000
##
## $EN.down
## original 0.2 0.3 0.4 0.5
## 1 1 1 1 1
##
## $EN.combined
## original 0.2 0.3 0.4 0.5
## 1 1 1 1 1
##
## $wEN.up
## original 0.2 0.3 0.4 0.5
## 0 0 0 0 0
##
## $wEN.down
## original 0.2 0.3 0.4 0.5
## 1 1 1 1 1
##
## $wEN.combined
## original 0.2 0.3 0.4 0.5
## 1 1 1 1 1
##
## $michael.up
## original 0.2 0.3 0.4 0.5
## 0 0 0 0 0
##
## $michael.down
## original 0.2 0.3 0.4 0.5
## 1.000 1.000 1.000 0.938 0.938
##
## $michael.combined
## original 0.2 0.3 0.4 0.5
## 1.000 1.000 1.000 0.938 0.938
##
## $MW
## original 0.2 0.3 0.4 0.5
## 1.000 1.000 1.000 0.938 0.938
##
## $KS
## original 0.2 0.3 0.4 0.5
## 1.000 1.000 1.000 0.938 0.938
##
## $KS2
## original 0.2 0.3 0.4 0.5
## 0.938 0.875 0.812 0.750 0.625
##
## $GSEA
## original 0.2 0.3 0.4 0.5
## 0.125 0.125 0.125 0.125 0.188
##
## $GSEAmodified
## original 0.2 0.3 0.4 0.5
## 0.438 0.438 0.438 0.438 0.500
##
## $modSites
## original 0.2 0.3 0.4 0.5
## 0.938 0.938 0.875 0.875 0.875
##
## $modScore
## original 0.2 0.3 0.4 0.5
## 1.000 1.000 1.000 0.875 0.938